## Setup, Load Libraries & Report Versions
knitr::opts_chunk$set(echo = TRUE)
## Load Libraries -------------------
## IMPORTANT!
## Please check the paths and files in the file "load_packages_and_functions.R" before running this!
## All .R files must be in the same folder, specified below in 'lib_folder'
lib_folder = "~/ad-omics/ricardo/MyRepo/structuralvariation/R_Library/"
source(paste0(lib_folder,"load_packages_and_functions.R"))
createDT <- function(DF, caption="", scrollY=500){
data <- DT::datatable(DF, caption=caption,
extensions = 'Buttons',
class = "display",
callback = JS("return table;"),
filter = c("none", "bottom", "top"),
escape = TRUE,
style = "default", width = NULL, height = NULL, elementId = NULL,
fillContainer = getOption("DT.fillContainer", NULL),
autoHideNavigation = getOption("DT.autoHideNavigation", NULL),
selection = c("multiple", "single", "none"),
plugins = NULL, editable = FALSE,
options = list( dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = scrollY, scrollX=T, scrollCollapse = T, paging = T,
columnDefs = list(list(className = 'dt-center', targets = "_all"))
)
)
return(data)
}Structural Variations (SVs) are increasingly recognized for their importance in genomics. Short-read sequencing is the most widely-used approach for genotyping large numbers of samples for SVs but suffers from relatively poor accuracy. SVCollector, is an open-source method that optimally selects samples to maximize variant discovery and validation using long read resequenc- ing or PCR-based validation.
SVCollector has two major ranking modes: topN, and greedy. For the topN mode, it picks samples with the largest number of SVs irrespective if the SVs are shared with other samples. For the greedy mode, it finds a set of samples that collectively contain the largest number of distinct variants. Solving this exactly is computationally intractable as it is a version of the well-known NP-hard set cover problem. Consequently, it uses a greedy approximation thatstarts with the sample with the largest number of variants, and then iteratively picks the sample containing the largest number variants not yet included in the set. It also has a random mode that mimics an arbitrary selection process, and is helpful for evaluating the diversity of the topN or greedy approaches. For each mode, SVCollector reports the rank, sample name, its unique contribution of SVs, the cumulative sum of SVs up to the chosen sample, and the cumulative percentage compared to the total number of SVs in the input VCF file.
root_dir = "~/ad-omics/ricardo/"
svcollector_path = paste0(root_dir,"MyApps/SVCollector/Debug/SVCollector")
svcollector_script = paste0(root_dir,"MyApps/SVCollector/SVCollector.sh")
ROSMAP_path = paste0(root_dir,"ROSMAP/")
gt_merged_ROSMAP = paste0(ROSMAP_path,"05_MergedSamples/fromSMOOVE/samples_merged_DEL.Final.vcf.gz")
raw_merged_ROSMAP = paste0(ROSMAP_path,"03_MergedCalls/SURVIVOR/population/samples_merged_DEL.vcf")
ROSMAP_Ancestry = read.csv(paste0(ROSMAP_path,"01_GeneticPC/pca.ancestry_prediction.csv"), header = T, stringsAsFactors = F, check.names = F, comment.char = "")
ROSMAP_Euro = ROSMAP_Ancestry[ROSMAP_Ancestry$EUR==1,]
MSBB_path = paste0(root_dir,"MSinai/")
gt_merged_MSBB = paste0(MSBB_path,"05_MergedSamples/fromSMOOVE/samples_merged_DEL.Final.vcf.gz")
raw_merged_MSBB = paste0(MSBB_path,"03_MergedCalls/SURVIVOR/population/samples_merged_DEL.vcf")
MSBB_Ancestry = read.csv(paste0(MSBB_path,"01_GeneticPC/pca.ancestry_prediction.csv"), header = T, stringsAsFactors = F, check.names = F, comment.char = "")
MSBB_Euro = MSBB_Ancestry[MSBB_Ancestry$EUR==1,]
# Outputs
work_dir = paste0(root_dir,"AMP_AD/LongReadsValidation.FINAL/")
system(paste0("mkdir -p ",work_dir))
gt_vcf_ROSMAP = paste0(work_dir,"ROSMAP.gt.vcf")
raw_vcf_ROSMAP = paste0(work_dir,"ROSMAP.raw.vcf")
gt_vcf_MSBB = paste0(work_dir,"MSBB.gt.vcf")
raw_vcf_MSBB = paste0(work_dir,"MSBB.raw.vcf")ROSMAP_samples = system(paste0("ml bcftools; bcftools query -l ", gt_merged_ROSMAP), intern = T)
ROSMAP_samples_to_use = unique(ROSMAP_Euro[ROSMAP_Euro$`#sample`%in%ROSMAP_samples,"#sample"])
MSBB_samples = system(paste0("ml bcftools; bcftools query -l ", gt_merged_MSBB), intern = T)
MSBB_samples_to_use = unique(MSBB_Euro[MSBB_Euro$`#sample`%in%MSBB_samples,"#sample"])sysout = system(paste0("ml bcftools; bcftools view --force-samples -s ", paste0(ROSMAP_samples_to_use, collapse = ","), " ", raw_merged_ROSMAP, " | bcftools sort -o ", raw_vcf_ROSMAP), intern = T)
sysout = system(paste0("ml bcftools; bcftools view --force-samples -s ", paste0(ROSMAP_samples_to_use, collapse = ","), " ", gt_merged_ROSMAP, " | bcftools sort -o ", gt_vcf_ROSMAP), intern = T)
sysout = system(paste0("ml bcftools; bcftools view --force-samples -s ", paste0(MSBB_samples_to_use, collapse = ","), " ", raw_merged_MSBB, " | bcftools sort -o ", raw_vcf_MSBB), intern = T)
sysout = system(paste0("ml bcftools; bcftools view --force-samples -s ", paste0(MSBB_samples_to_use, collapse = ","), " ", gt_merged_MSBB, " | bcftools sort -o ", gt_vcf_MSBB), intern = T)
sysout = system(paste("ml parallel;",svcollector_script, raw_vcf_ROSMAP, length(ROSMAP_samples_to_use), work_dir), intern = T)
sysout = system(paste("ml parallel;",svcollector_script, gt_vcf_ROSMAP, length(ROSMAP_samples_to_use), work_dir), intern = T)
sysout = system(paste("ml parallel;",svcollector_script, raw_vcf_MSBB, length(MSBB_samples_to_use), work_dir), intern = T)
sysout = system(paste("ml parallel;",svcollector_script, gt_vcf_MSBB, length(MSBB_samples_to_use), work_dir), intern = T)#raw_ROSMAP = read.vcfR(raw_vcf_ROSMAP, verbose = F)
#gt_ROSMAP = read.vcfR(gt_vcf_ROSMAP, verbose = F)
#raw_MSBB = read.vcfR(raw_vcf_MSBB, verbose = F)
#gt_MSBB = read.vcfR(gt_vcf_MSBB, verbose = F)cohort = "ROSMAP"Samples ancestries
Getting only Europeans
a = plot_samples_ancestry(ROSMAP_samples_to_use, label="ROSMAP-EUR")